﻿-- Demo script for the session Introduction to Regular Expressions in 
-- SQL Server.
-- © 2026 Erland Sommarskog.
--------------------------------------------------------------
-- Lesson One.
USE tempdb;
go
-- Build the demo database from scratch. Two properties of the collation
-- matter: it is case-insensitive (CI), and it is a UTF-8 collation, which
-- gives varchar Unicode support so that string literals need no N prefix.
DROP DATABASE IF EXISTS regexp;
CREATE DATABASE regexp
   COLLATE Latin1_General_100_CI_AS_SC_UTF8;
go
USE regexp;
go
-- Create a simple demo table that holds one word per row. The word itself
-- is the primary key. (The words are inserted a little further down.)
DROP TABLE IF EXISTS words;
CREATE TABLE words (word varchar(40) NOT NULL,
                    CONSTRAINT pk_words PRIMARY KEY (word)
);
go
-- How to use regexp_like: It's a *boolean* function, so you don't
-- compare the return value with anything.
SELECT word FROM words WHERE regexp_like(word, 'No such');
go
-- You cannot use it in a SELECT list directly:
SELECT regexp_like('String', 'No match');
go
-- Must wrap in IIF or CASE:
SELECT IIF(regexp_like('String', 'No match'), 1, 0);
go
-- Load the sample words: mixed case, a digit suffix, punctuation inside
-- the word and accented letters.
INSERT INTO words (word)
VALUES ('A'),
       ('atomic'),
       ('ballad'),
       ('banana'),
       ('box'),
       ('Grimsby'),
       ('Micro$oft'),
       ('résumé'),
       ('Saab900'),
       ('Super-8');
go
SELECT word FROM words;
go
-- Comparing LIKE and regexp_like.
SELECT word FROM words WHERE word LIKE 'a';
SELECT word FROM words WHERE regexp_like(word, 'a');
-- Results are entirely disjoint. LIKE only matches "A", since there are
-- no trailing and leading wildcards. Regexps on the other hand match anywhere
-- in the string. LIKE understands collations and here we have a CI collation,
-- so "a" = "A". Regexps are case-sensitive by default and blind to collations.
go
-- Add trailing/leading wildcard with LIKE, and 'i' flag to regexp_like 
-- to force case-insensitive. Results are now the same.
SELECT word FROM words WHERE word LIKE '%a%';
SELECT word FROM words WHERE regexp_like(word, 'a', 'i');
go
-- We can use ^ to anchor a regexp to the start of the string.
SELECT word FROM words WHERE word LIKE 'a%';
SELECT word FROM words WHERE regexp_like(word, '^a', 'i');
go
-- And $ matches at the end of the string.
SELECT word FROM words WHERE word LIKE '%a';
SELECT word FROM words WHERE regexp_like(word, 'a$', 'i');
go
-- We can use both.
SELECT word FROM words WHERE regexp_like(word, '^a$', 'i');
go
-- In LIKE patterns, underscore stands for "any character", exactly once.
-- In regular expressions, the dot "." serves that role.
SELECT word FROM words WHERE word LIKE '%a_a%';
SELECT word FROM words WHERE regexp_like(word, 'a.a');
go
-- Try to find the word with a dollar character
SELECT word FROM words WHERE regexp_like(word, '$');
go
-- All words were returned. $ means end of string, and all strings have
-- an end. We need to escape it with a backslash to match the $ as such.
SELECT word FROM words WHERE regexp_like(word, '\$');
go
-- The backslash means "take at face value" before an ASCII punctuation
-- character. Backslash + alphanumeric may have a special meaning,
-- that we will look at later. If not, you get an error like here:
SELECT word FROM words WHERE regexp_like(word, '\o\$\o');
go
--------------------------------------------------------------
-- Lesson Two.
-- In a regular expression, we can use brackets to enclose a group of 
-- alternate characters to match. This works exactly the same way as with 
-- LIKE. List words that end in "x" or "y":
SELECT word FROM words WHERE word LIKE '%[yx]';
SELECT word FROM words WHERE regexp_like(word, '[yx]$');
go
-- We can use ranges. List words that end in a digit:
SELECT word FROM words WHERE word LIKE '%[0-9]';
SELECT word FROM words WHERE regexp_like(word, '[0-9]$');
go
-- We can combine ranges and separate characters. List words that end 
-- in a digit, "x" or "y":
SELECT word FROM words WHERE word LIKE '%[xy0-9]';
SELECT word FROM words WHERE regexp_like(word, '[xy0-9]$');
go
-- You can negate a set by putting a circumflex after the opening bracket.
-- List words that do *not* start with "a" or "b":
SELECT word FROM words WHERE word LIKE '[^ab]%';
SELECT word FROM words WHERE regexp_like(word, '^[^ab]', 'i');
-- Yes, confusing in the regexp version: Two "^" with different meanings!
go
-- Beware, that there can be differences due to collation.
-- Return words that end in a character between "d" and "t":
SELECT word FROM words WHERE word LIKE '%[d-t]';
SELECT word FROM words WHERE regexp_like(word, '[d-t]$', 'i');
-- LIKE understands collations, so it considers "é" to be between "d" and "t".
-- Regexps go by code point, and the code point for "d" is 100, "t" is 116.
-- "é" = 233 and is outside the range d-t.
go
--------------------------------------------------------
-- Lesson Three.
-- For this lesson, we will work with a table with sentences.
-- To make the exercise simpler, there is no punctuation, and there are
-- only ASCII words.
-- (Re)create the sentences table: a numeric key plus the sentence text.
DROP TABLE IF EXISTS sentences;
CREATE TABLE sentences (
       id       int          NOT NULL,
       sentence varchar(200) NOT NULL,
       CONSTRAINT pk_sentences PRIMARY KEY (id)
);
-- Eight plain sentences without punctuation.
INSERT INTO sentences (id, sentence)
VALUES (1, 'Jack tells Steven to help Jill'),
       (2, 'Stephen says Jill is in the store'),
       (3, 'Laura hears Jack Jill and Beth singing'),
       (4, 'Jill is watching Laura playing tennis'),
       (5, 'Laura is hoping for Stephen to drop in'),
       (6, 'To whom did Laura give the jack of clubs'),
       (7, 'Steven asks Jack what is that good for'),
       (8, 'Beth and Laura is visiting Jack and Jill');
go
-- Say that we want to find sentences with Jack or Jill. With LIKE we need
-- to use two LIKE predicates combined with OR:
SELECT id, sentence
FROM   sentences
WHERE  sentence LIKE '%Jack%' OR sentence LIKE '%Jill%';
go
-- With regular expressions we can do this more compactly with help of
-- the metacharacter "|", which permits you to specify alternate expressions
-- that can match.
SELECT id, sentence FROM sentences 
WHERE  regexp_like(sentence, 'Jack|Jill');
-- Note that we did not match the sentence with "jack of clubs" since
-- regular expressions are case-sensitive by default.
go
-- Say that we only want to find sentences starting with Jack or Jill.
-- We may try:
SELECT id, sentence FROM sentences 
WHERE regexp_like(sentence, '^Jack|Jill');
-- But this is not the correct result.

-- To solve this, we need yet another metacharacter: parentheses, which set the
-- precedence order. The "|" has the lowest precedence of all regexp operators.
SELECT id, sentence FROM sentences 
WHERE regexp_like(sentence, '^(Jack|Jill)');
-- Now we get the correct result.
go
-- Say that we want to find all sentences with Steven, who is spelled
-- inconsistently.
SELECT id, sentence FROM sentences 
WHERE regexp_like(sentence, 'Ste(v|ph)en');
go
-- We can nest parentheses and "|". Find all sentences that begin with
-- Jack or Stephen/Steven.
SELECT id, sentence FROM sentences 
WHERE regexp_like(sentence, '^(Jack|Ste(v|ph)en)');
go
-- We can have as many alternates as we want. Find sentences with any of
-- the words "to", "for", "of" or "in". Cover the cases that they may 
-- appear in the beginning or the end of the sentences. Here we use 
-- regexp_substr to return the first and second matches. The angle brackets
-- are there to show the surrounding spacing.
-- @pattern: one of the four words, delimited on each side by a space or by
-- the start/end of the string.
DECLARE @pattern varchar(30) = '(^| )(to|for|of|in)( |$)';
SELECT id,
       sentence,
       regexp_count(sentence, @pattern, 1, 'i') AS cnt,
       '<' + regexp_substr(sentence, @pattern, 1, 1, 'i') + '>' AS "1st",
       '<' + regexp_substr(sentence, @pattern, 1, 2, 'i') + '>' AS "2nd"
FROM   sentences
WHERE  regexp_like(sentence, @pattern, 'i');
go
-----------------------------------------------------
-- Lesson Four.
-- You may wonder what corresponds to LIKE's % in regular
-- expressions. Here is an emulation of the %, which we use to find words
-- with at least two "a" in them.
SELECT word FROM words WHERE word LIKE '%a%a%';
SELECT word FROM words WHERE regexp_like(word, 'a.*a');
-- Note here that ".*" are two metacharacters. ".", as we have learnt, 
-- means "any character". The asterisk is a *quantifier*, which means
-- "previous regular expression zero or more times". It is *not* a wildcard!
go
-- There are more quantifiers. Very commonly used is "+" which means 
-- "previous regexp, one or more times". 
-- So this means: All words with two "a" separated by at least one character.
SELECT word FROM words WHERE regexp_like(word, 'a.+a');
go
-- The question mark ("?") means "previous regular expression zero or one time".
-- This means: Words with two "a" adjacent to each other or only one
-- character apart.
SELECT word FROM words WHERE regexp_like(word, 'a.?a');
go
-- Let's look at some practical examples, which show regexp_split_to_table in
-- action. Split a text with inconsistent spacing into words.
SELECT '<' + value + '>', ordinal 
FROM   regexp_split_to_table('A   text   with  lots of   spaces', ' +')
ORDER  BY ordinal;
go
-- This example splits a comma-separated list with inconsistent spacing 
-- into table format, with no trailing or leading spaces.
SELECT '<' + value + '>', ordinal
FROM   regexp_split_to_table('101, 221 ,43,14,  85,  6', ' *, *')
ORDER  BY ordinal;
go
-- Let's play with our sentences. Find all sentences where Jack comes
-- before Jill. We can do this quite simple-minded like this:
SELECT id, sentence 
FROM   sentences 
WHERE  regexp_like(sentence, 'Jack.*Jill');
go
-- But say that we want to include spaces to avoid matches in Jackie, MacJill
-- or whatever. First, as before, put alternates with beg/end-of-line and
-- space before Jack and after Jill. Jack must then be followed by a
-- space, and Jill must be preceded by one. In between there may be anything.
SELECT id, sentence 
FROM   sentences 
WHERE  regexp_like(sentence, '(^| )Jack .* Jill( |$)');
-- Oops! Missed the sentence with Jill directly after Jack with no word
-- in between. 
go
-- Address this by adding an alternate for this case.
SELECT id, sentence 
FROM   sentences 
WHERE  regexp_like(sentence, '(^| )Jack( .* | )Jill( |$)');
go
-- There are more general quantifiers. This returns all sentences that end
-- with a four-letter word. {4} = previous regexp matches exactly four times.
SELECT id, sentence
FROM   sentences s
WHERE  regexp_like(sentence, ' [A-Za-z]{4}$');
go
-- All sentences that end with a word that is three to five characters long.
SELECT id, sentence
FROM   sentences s
WHERE  regexp_like(sentence, ' [A-Za-z]{3,5}$');
go
-- All final words with six or more characters.
SELECT id, sentence
FROM   sentences s
WHERE  regexp_like(sentence, ' [A-Za-z]{6,}$');
go

-------------------------------------------------------
-- Lesson Five.
-- We will now work with regexp_replace. This is the example from the
-- beginning of the session. Collapse multiple spaces into a single one.
SELECT regexp_replace
       ('A    text with   extra  space', ' +', ' ');
go
-- The replacement string can refer back to the match. We can use "\&" which
-- means "insert the match here". This puts angle brackets around all 
-- occurrences of Jack or Jill.
SELECT regexp_replace(sentence, '(Jack|Jill)', '<\&>')
FROM   sentences;
go
-- Say that we only want to do that when Jack or Jill are in the middle of
-- the sentence. We might try:
SELECT regexp_replace(sentence, ' (Jack|Jill) ', '<\&>')
FROM   sentences;
-- But the angle brackets appear in the wrong place, because the matches 
-- include the spaces.
go
-- The solution: In regular expressions, parentheses serve *two* purposes: 
-- 1) Precedence order 2) Define capture groups. We can refer to the first
-- capture group as \1, the next as \2 etc. This leads to:
SELECT regexp_replace(sentence, ' (Jack|Jill) ', ' <\1> ')
FROM   sentences;
go
-- A common situation is that you want to replace something when it appears
-- in a particular context. Then you need to include the context in your
-- pattern. By using capture groups and \1 etc, you can retain the context.
-- What we want to do is to find sentences where Jack comes between Laura and
-- Jill and replace him with Steven. To repeat the pattern we used previously,
-- here is a query only to find the sentences.
SELECT id, sentence
FROM   sentences
WHERE  regexp_like(sentence, '(^| )Laura( .* | )Jack( .* | )Jill( |$)');
go
-- With regexp_replace we need capture groups. We could work from the existing
-- parentheses and repeat "Laura" and "Jill", but it's better to slap extra
-- parens on everything before and after Jack. Thus:
DECLARE @pat varchar(80) = '((^| )Laura( .* | ))Jack(( .* | )Jill( |$))';
SELECT id, sentence, regexp_replace(sentence, @pat, '\1Steven\4')
FROM   sentences
WHERE  regexp_like(sentence, @pat);
-- Note that we need \4 for the part with Jill. The nested parentheses also
-- define capture groups, and groups are numbered by their opening
-- parenthesis: \1 is everything up to Jack, \2 and \3 are nested inside \1,
-- and \4 is everything after Jack.
go
-- This example is sort of silly, but it illustrates a common situation that
-- you should be aware of.
-- In sentences with Jill, we want to change -ing to -ING.
-- \1 captures everything from "Jill" up to the letter before "ing"; \2
-- captures the space (or the end of the string) that follows "ing".
DECLARE @pat varchar(80) = '(Jill .*[a-z])ing( |$)';
SELECT id, sentence, regexp_replace(sentence, @pat, '\1ING\2')
FROM   sentences
WHERE  regexp_like(sentence, @pat);
go
-- Note that in sentence 4, the second -ing was replaced. This is because
-- quantifiers are "greedy", that is, they match as much as they can. Only
-- when the rest of the regexp then fails to match does the regexp backtrack
-- and try a shorter match.
--   You can change the behaviour by adding a "?" after the quantifier to make
-- it non-greedy, so that it stops as soon as the rest of the regexp can match.
DECLARE @pat varchar(80) = '(Jill .*?[a-z])ing( |$)';
SELECT id, sentence, regexp_replace(sentence, @pat, '\1ING\2')
FROM   sentences
WHERE  regexp_like(sentence, @pat);
-- This situation is particularly common when the pattern before the
-- quantifier is the dot (= any character).
go
------------------------------------------------------
-- Lesson Six.
-- For this lesson, we need yet another table! It's a table where
-- there is a label followed by data matching the label.
-- (Re)create the table of labelled test strings, keyed on id.
DROP TABLE IF EXISTS teststrings;
CREATE TABLE teststrings (id  int          NOT NULL,
                          str varchar(100) NOT NULL,
                          CONSTRAINT pk_teststrings PRIMARY KEY (id)
);
go
-- Each row is a label followed by the data the label names. Rows 3-8 end
-- with a single character produced by nchar() from its code point.
INSERT INTO teststrings (id, str)
VALUES (1,  'Number: 134'),
       (2,  'Number: 789'),
       (3,  'tab:' + nchar(9)),
       (4,  'CR:' + nchar(13)),
       (5,  'LF:' + nchar(10)),
       (6,  'FF:' + nchar(12)),
       (7,  'space:' + nchar(32)),
       (8,  'NBSP:' + nchar(160)),
       (9,  'Greek: άλφα'),
       (10, 'Emoji: 😕');
go
SELECT * FROM teststrings;
go
-- Backslash + alphanumeric have special meanings, and we will look at
-- some of these. Here are escape sequences for control characters:
SELECT *, '\t' FROM teststrings WHERE regexp_like(str, '\t'); 
SELECT *, '\r' FROM teststrings WHERE regexp_like(str, '\r'); 
SELECT *, '\n' FROM teststrings WHERE regexp_like(str, '\n'); 
SELECT *, '\f' FROM teststrings WHERE regexp_like(str, '\f'); 
go
-- Escape sequences to match characters by their hex code. We need braces
-- for values > 0xFF.
SELECT *, '\xA0' FROM teststrings WHERE regexp_like(str, '\xA0');
SELECT *, '\x{3B1}' FROM teststrings WHERE regexp_like(str, '\x{3B1}$');
SELECT *, '\x{1F615}' FROM teststrings WHERE regexp_like(str, '\x{1F615}$');
go
-- There are a number of escape sequences for character classes.
-- "\s" is for white-space characters. It is ASCII only, so it does not
-- include the hard space. (Which, depending on the situation, may be 
-- what you want. Or not.)
SELECT * 
FROM   teststrings
WHERE  regexp_like(str, '\s$');
go
-- If you also want to include the hard space in your search for spaces, 
-- you can use brackets:
SELECT * 
FROM   teststrings
WHERE  regexp_like(str, '[\s\xA0]$');
go
-- For all the character-class sequences, the uppercase version is the 
-- negation. This lists all test strings not ending with a white space.
SELECT * 
FROM   teststrings
WHERE  regexp_like(str, '\S$');
go
-- That is, the same as:
SELECT * 
FROM   teststrings
WHERE  regexp_like(str, '[^\s]$');
go
-- \d matches digits, essentially a shortcut for [0-9]. 
SELECT id, str
FROM   teststrings
WHERE  regexp_like(str, '\d$');
go
-- Just like \S, \D is the negation of the set.
SELECT id, str
FROM   teststrings
WHERE  regexp_like(str, '\D$');
go
--------------------------------------------------------------------
-- Lesson Seven.
-- We will now learn to recognise letters and non-letters. For this purpose we
-- will revisit our sentences table. We reload it, now with proper punctuation,
-- and one sentence added.
-- Empty the table and load the punctuated versions of the sentences,
-- plus a ninth sentence that contains accented letters.
TRUNCATE TABLE sentences;
INSERT INTO sentences (id, sentence)
VALUES (1, 'Jack tells Steven to help Jill.'),
       (2, 'Stephen says: "Jill is in the store."'),
       (3, 'Laura hears Jack, Jill and Beth singing.'),
       (4, 'Jill is watching Laura playing tennis.'),
       (5, 'Laura is hoping for Stephen to drop in.'),
       (6, 'To whom did Laura give the jack of clubs?'),
       (7, 'Steven asks Jack "what is that good for?"'),
       (8, 'Beth and Laura is visiting Jack and Jill'),
       (9, 'Jack sends the résumé to Jill on HR.');

-- To match letters in general, we use \pL. (Observe case: lowercase "p" and
-- uppercase "L"). The below extracts all words in the sentences.
-- One output row per run of letters, in the order the runs appear
-- within each sentence.
SELECT   snt.id,
         hit.match_value
FROM     sentences AS snt
CROSS APPLY regexp_matches(snt.sentence, '\pL+') AS hit
ORDER BY snt.id,
         hit.match_id;
go
-- \p stands for Unicode category of which there are a bunch. There are also
-- subcategories. For instance Lu for uppercase and Ll for lowercase. So
-- this lists all words with initial uppercase, followed by one or more 
-- lowercase.
SELECT s.id, m.match_value
FROM   sentences s
CROSS  APPLY regexp_matches(s.sentence, '\p{Lu}\p{Ll}+') m
ORDER  BY s.id, m.match_id;
go
-- Negation works here as well, so \PL matches any character that is not a
-- letter. Let's use this for a proper solution for some of the queries we 
-- looked at earlier. Find all sentences with Jack or Jill:
SELECT id, sentence
FROM   sentences
WHERE  regexp_like(sentence, '(^|\PL)(Jack|Jill)(\PL|$)');
go
-- All sentences where Jack comes before Jill. Rather than using space, we
-- use \PL to match any non-letter character.
SELECT id, sentence 
FROM   sentences 
WHERE  regexp_like(sentence, '(^|\PL)Jack(\PL.*\PL|\PL)Jill(\PL|$)');
go
-- Exercise for the reader: Change the pattern where we replaced Jack
-- with Steven between Laura and Jill to use \PL.
go
-- Finally, we will look at things to avoid. \w is supposed to match "word"
-- characters: letters, digits and underscore. [[:alpha:]] is a POSIX class
-- which should match letters. But look at the result when we extract the
-- words from sentence 9. (The example is a little funny, since we first
-- split to table with regexp_matches to immediately combine the words to
-- a single string with string_agg. That is only to make the demo compact.)
-- Same query three times over sentence 9; only the pattern differs.
-- 1) Unicode letter category.
SELECT string_agg(hit.match_value, ' - ')
          WITHIN GROUP (ORDER BY hit.match_id)
FROM   sentences AS snt
CROSS  APPLY regexp_matches(snt.sentence, '\pL+') AS hit
WHERE  snt.id = 9;

-- 2) The \w "word character" class.
SELECT string_agg(hit.match_value, ' - ')
          WITHIN GROUP (ORDER BY hit.match_id)
FROM   sentences AS snt
CROSS  APPLY regexp_matches(snt.sentence, '\w+') AS hit
WHERE  snt.id = 9;

-- 3) The POSIX [[:alpha:]] class.
SELECT string_agg(hit.match_value, ' - ')
          WITHIN GROUP (ORDER BY hit.match_id)
FROM   sentences AS snt
CROSS  APPLY regexp_matches(snt.sentence, '[[:alpha:]]+') AS hit
WHERE  snt.id = 9;
go
---------------------------------------------------
-- Lesson Eight (not featured in the presentation).
-- This lesson gives some tips about multi-line strings.
-- Table for the multi-line demo strings: a numeric key plus the text.
-- Nullability is spelled out, as for the other demo tables, so that the
-- result does not depend on the session's ANSI_NULL defaults.
DROP TABLE IF EXISTS multiline;
CREATE TABLE multiline(id   int           NOT NULL,
                       text varchar(2000) NOT NULL,
                       CONSTRAINT pk_multiline PRIMARY KEY (id));
go
-- Yes, that is the first half of the first verse of Smoke on the Water!
INSERT multiline(id, text)
   VALUES(1, 
'We all came down to Montreux
On the Lake Geneva shorline
To make records with a mobile
We did not have much time');
go
-- We expect this to return the row, since we are looking for something
-- that starts with "We" and ends in "time".
SELECT id, text
FROM   multiline
WHERE  regexp_like(text, '^We.*time$');
-- But it doesn't! You see, the dot actually stands for "any character 
-- but newline".
go
-- We can save the show with an alternate:
SELECT id, text
FROM   multiline
WHERE  regexp_like(text, '^We(.|\n)*time$');
go
-- This can get bulky if we need to do it many times. For this reason,
-- we can use the "s" flag which means "dot also matches newline".
SELECT id, text
FROM   multiline
WHERE  regexp_like(text, '^We.*time$', 's');
go
-- We can combine the "i" and "s" flags:
SELECT id, text
FROM   multiline
WHERE  regexp_like(text, '^WE.*TIME$', 'is');
go
-- At this point open multiline.sql to load the second half of the
-- Smoke on the Water verse.
go
-- Split the text into lines. In Windows CR-LF is the line separator, so
-- \r\n seems like the right thing:
SELECT ml.id, s.value, s.ordinal
FROM   multiline ml
CROSS  APPLY regexp_split_to_table(ml.text, '\r\n') s
ORDER  BY ml.id, s.ordinal;
-- But the second row does not split!
go
-- Multiline.sql is saved in Unix format, and on Unix lines are terminated
-- by LF (\n) alone. You can never be sure what format you have, so when
-- splitting into lines, use this pattern to make the CR optional.
SELECT ml.id, s.value, s.ordinal
FROM   multiline ml
CROSS  APPLY regexp_split_to_table(ml.text, '\r?\n') s
ORDER  BY ml.id, s.ordinal;
-- If you also want to cover (old) Mac format, where lines are separated by
-- CR alone, this is left as an exercise to the reader.

